from IPython.display import display
import pandas as pd
import matplotlib.pyplot as plt
import warnings
# Silence all warnings for cleaner notebook output.
warnings.simplefilter("ignore")
%matplotlib inline
# Load the per-machine sensor CSVs; the first CSV column becomes the index
# (presumably a timestamp -- TODO confirm against the raw files).
data_16 = pd.read_csv(r'C:\New_partition\Thesis\Files\Files\shap\machine_16.csv',index_col=0)
data_0 = pd.read_csv(r'C:\New_partition\Thesis\Files\Files\shap\machine_0.csv',index_col=0)
data_9 = pd.read_csv(r'C:\New_partition\Thesis\Files\Files\shap\machine_9.csv',index_col=0)
#!pip install cufflinks
import cufflinks as cf
import plotly
import plotly.io as pio
# Render plotly figures inline in the notebook.
pio.renderers.default='notebook'
import plotly.offline as py
import plotly.graph_objs as go
cf.go_offline() # required to use plotly offline (no account required).
py.init_notebook_mode() # graphs charts inline (IPython).
import plotly.express as px
import numpy as np
I am visualizing data from multiple machines to see whether their sensor readings follow a common trend.
# Render interactive figures inline in the notebook.
pio.renderers.default = 'notebook'

# Machine 16: line plot of the first four sensor channels.
fig_2 = px.line(data_16.iloc[:, :4])
fig_2.show()

pio.renderers.default = 'notebook'

# Machine 9: same view, for comparison against machine 16.
fig_3 = px.line(data_9.iloc[:, :4])
fig_3.show()
I have found that in both machines the data plotted above follows the same pattern. There are some clear outliers. By zooming in on the data points I found that values greater than 260 or less than -260 fall in the outlier range. I have also observed that during normal operation the sensor values are greater than 0. During failure they are close to zero, and in case of a fault they peak very high.
#data=data[~( (data.iloc[:,1:2] > 260) | (data.iloc[:,1:2]<-260) ).all(axis=1) & ~( (data.iloc[:,0:1] > 260) | (data.iloc[:,0:1]<-260) ).all(axis=1) & ~( (data.iloc[:,2:3] > 260) | (data.iloc[:,2:3]<-260) ).all(axis=1) & ~( (data.iloc[:,3:4] > 260) | (data.iloc[:,3:4]<-260) ).all(axis=1) ]
#plt.plot(range(len(data)), data)
#plt.show()
# Show the raw column names coming from the CSV.
data_16.columns
# Rename the columns to S1..S4, i.e. sensor 1 through sensor 4.
data_16.columns = ['S' + str(j) for j in range(1, 5)]
# Mark outliers: any reading whose magnitude exceeds 260 becomes NaN.
data_16[data_16.abs() > 260] = np.nan
# Replace each masked outlier with the mean of every earlier (already
# cleaned) reading in the same sensor column.
# NOTE(review): despite the original comment ("mean of last non outlier
# window"), the code averages ALL preceding rows of the column, not a
# window; that behavior is preserved here.
for col in range(4):
    for row in np.where(data_16.iloc[:, col].isna())[0]:
        # Scalar .iloc indexing replaces the original 1-row/1-col slice
        # assignment and the deprecated positional lookup `.mean()[0]`
        # on a string-indexed Series (removed in pandas >= 2).
        # A NaN in the very first row has no history, so the empty-slice
        # mean is NaN and the value stays NaN -- same as the original.
        data_16.iloc[row, col] = data_16.iloc[:row, col].mean()
pio.renderers.default = 'notebook'

# Re-plot machine 16 after outlier removal to confirm the cleanup worked.
fig_4 = px.line(data_16.iloc[:, :4])
fig_4.show()

# First difference of each sensor: highlights jumps between readings.
data_m = data_16.diff()

pio.renderers.default = 'notebook'
fig_5 = px.line(data_m.iloc[:, :4])
fig_5.show()

# Placeholder label; the real labels are assigned further below.
data_16['label'] = '0'
data_16.head(10)
#data_m.to_csv(r'C:\New_partition\To_delete\data_wo_out.csv')
Based on a specific threshold value I am labelling the data points as fault, failure and normal. I have 3 specific criteria:
# Label every time step from the first differences (data_m):
#   failure: all four sensor deltas are essentially flat (|delta| <= 1)
#   normal : all four deltas stay inside the +/-108 band
#   fault  : any delta reaches +/-108 or beyond
# The first row of data_m is NaN (diff has no predecessor); NaN fails every
# comparison, so that row keeps the placeholder '0' and is dropped below.
sensors = ('S1', 'S2', 'S3', 'S4')
label_pos = data_16.columns.get_loc('label')
for i in range(len(data_m)):
    # Positional access (.iloc) instead of label access (data_m.S1[i]):
    # the CSV index need not be 0..n-1.  Likewise, .iloc assignment
    # replaces the chained assignment data_16.label[i] = ..., which
    # triggers SettingWithCopyWarning and can silently fail under
    # pandas copy-on-write.
    deltas = data_m.iloc[i]
    if all(abs(deltas[s]) <= 1 for s in sensors):
        data_16.iloc[i, label_pos] = 'failure'
    elif all(-108 <= deltas[s] <= 108 for s in sensors):
        data_16.iloc[i, label_pos] = 'normal'
    elif any(abs(deltas[s]) >= 108 for s in sensors):
        data_16.iloc[i, label_pos] = 'fault'
# Drop any row that never matched a rule (only the leading NaN diff row).
data_16 = data_16[data_16['label'] != '0']
data_16
import seaborn as sns
import matplotlib.pyplot as plt

# One scatter plot per sensor, colored by the assigned label.
fig_6, fig_7, fig_8, fig_9 = [
    px.scatter(data_16, x=data_16.index, y=data_16[col], color="label")
    for col in ('S1', 'S2', 'S3', 'S4')
]

pio.renderers.default = 'notebook'
for fig in (fig_6, fig_7, fig_8, fig_9):
    fig.show()
The above graphs show the data points of each sensor. The color signifies whether each data point is classified as normal, fault or failure.
My approach will pinpoint the time as soon as it detects that a sensor value lies in the fault region or the failure region. This would help ExampleCo.Inc to quickly take action. This is shown in the figure above. You can see the data points which are classified as faulty, normal and failure.
I have set a hard threshold for detecting faults and failures based on visualization of the data. Setting a hard threshold is not effective, because if the fault or failure value at some point in time lies a little outside the threshold, my approach might miss the fault.
I could modify my existing approach by using a median filter or other denoising technique to smooth out the data because it is mentioned in the description "When a machine is operating in normal mode the data behaves in a fairly predictable way, but with a moderate amount of noise." It will be effective to smooth the signal but not too much as to change the value. Also I could utilize another technique to remove outliers such as using Z scores, interquartile range to create outlier fences.